from bokeh.io import show, output_notebook, output_file
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, FactorRange, Legend, HoverTool, DatetimeTickFormatter, BoxAnnotation, Toggle, CheckboxButtonGroup,CheckboxGroup
from bokeh.models import Label
from bokeh.transform import dodge
from bokeh.layouts import layout, gridplot, row, column, widgetbox
from bokeh.models.widgets import Tabs, Panel
from matplotlib.pyplot import viridis
import bokeh.palettes
from bokeh.embed import file_html
from bokeh.resources import CDN
from IPython.display import HTML
output_notebook()
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import json
from branca.colormap import linear
from folium.features import DivIcon
import folium
import math
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline
# Inject a "Display code" / "Hide Code" toggle button into the rendered notebook.
# The embedded script hides every `div.input` element when the page is ready and
# flips their visibility (tracked in the JS global `code_shown`) each time the
# form below is submitted.
# NOTE(review): the script calls `$`, so it presumably needs jQuery on the page — confirm for the target front end.
HTML('''<script>
$('div.input').hide('500')
function code_toggle() {
if (code_shown){
$('div.input').hide('500');
$('#toggleButton').val('Display code')
} else {
$('div.input').show('500');
$('#toggleButton').val('Hide Code')
}
code_shown = !code_shown
}
$( document ).ready(function(){
code_shown=false;
$('div.input').hide()
});
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Display code"></form>''')
%%HTML
<style>
div.prompt {display:none}
</style>
This interactive visualization investigates how the COVID-19 lockdown in Denmark has affected public transport in the Urban Area of Copenhagen. This will be done by answering the following questions:
Let's delve in and answer the first question right away.
One way to measure whether public transport has changed is to measure the average speed of busses. This also gives an indication of how many vehicles there generally are on the roads.
It can be seen that from the start of the lockdown on 11 March the average speed did indeed generally increase. In particular, from 28/03 to 13/04 the average speed is consistently high, until shortly before the reopening began on 15/04.
Try to hover over the graphic. Can you figure out why the average speed varies as a wave?
# --- Daily average bus speed with the lockdown phases highlighted -----------
# Read the per-day speed table; its first CSV column becomes a datetime index.
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'
allspeed = pd.read_csv(os.path.join(path, "speedPerDay.csv"), sep=";", index_col=0)
allspeed.index = pd.to_datetime(allspeed.index)
source = ColumnDataSource(allspeed)

# Datetime x-axis, fixed y-range and no toolbar.
p = figure(
    plot_width=775,
    plot_height=300,
    x_axis_type="datetime",
    toolbar_location=None,
    y_range=(25, 32),
)
p.title.text = 'Avg. speed of public busses per day in Copenhagen'
p.yaxis.axis_label = 'km/t'

color = bokeh.palettes.Category20[3]
legend_it = []  # defined as in the original cell; nothing in this cell appends to it

# The single line glyph reads the "Date" and "Hast" columns of `source`.
line_color = color[1]
r = p.line(
    "Date",
    "Hast",
    source=source,
    line_width=2,
    color=line_color,
    alpha=0.8,
    muted_color=line_color,
    muted_alpha=0.2,
    visible=True,
)

# Hover follows the cursor along a vertical line and is attached to the speed line only.
hover = HoverTool(
    renderers=[r],
    mode="vline",
    formatters={'@Date': 'datetime'},
    tooltips=[
        ('Speed: ', '@Hast km/t'),
        ("Date: ", '@Date{%d-%m}'),
        ("Day: ", '@DayOfWeek'),
    ],
)
p.add_tools(hover)

# Start/end timestamps of each highlighted period.
(lockdown1_start, lockdown1_end,
 lockdown2_start, lockdown2_end,
 easter_start, easter_end,
 reopening_start, reopening_end) = map(
    pd.to_datetime,
    ['20200311', '20200318',
     '20200318', '20200415',
     '20200404', '20200412',
     '20200415', '20200421'],
)

# One shaded box per period: (left edge, right edge, fill colour, fill alpha).
phase_specs = [
    (lockdown1_start, lockdown1_end, 'red', 0.1),
    (lockdown2_start, lockdown2_end, 'red', 0.2),
    (easter_start, easter_end, 'yellow', 0.2),
    (reopening_start, reopening_end, 'green', 0.1),
]
lockdown1, lockdown2, easter, reopening = [
    BoxAnnotation(left=start, right=end,
                  fill_color=fill, fill_alpha=opacity, visible=True)
    for start, end, fill, opacity in phase_specs
]
for phase_box in (lockdown1, lockdown2, easter, reopening):
    p.add_layout(phase_box)

# (Toggle widgets linking each box's `visible` property to a button were tried
#  in this cell and left disabled; the boxes are always shown.)

# Caption boxes positioned in screen units: (x, y, text).
caption_specs = [
    (235, 230, 'First step of lockdown'),
    (280, 210, 'Second step of lockdown'),
    (485, 230, 'Easter Break'),
    (600, 230, 'Opening'),
]
lockdown1_cit, lockdown2_cit, easter_cit, opening_cit = [
    Label(x=x_px, y=y_px, x_units='screen', y_units='screen',
          text=caption, render_mode='canvas',
          border_line_color='black', border_line_alpha=1.0,
          background_fill_color='white', background_fill_alpha=1.0,
          text_font_size='8pt')
    for x_px, y_px, caption in caption_specs
]
for caption_label in (lockdown1_cit, lockdown2_cit, easter_cit, opening_cit):
    p.add_layout(caption_label)

show(p)
We can see that there indeed is a change in speed but is the change in speed equal for all municipalities?
The choropleth map below shows the percentage increase in speed during the lockdown for each municipality in the Urban Area of Copenhagen. First of all, it can be seen that the speed has increased in all municipalities, but that the increase varies from 0.14% in Høje-Taastrup to 8.8% in Rødovre. -> One intuitive explanation could be that municipalities with motorways running through them would see bigger increases, because there would no longer be any queues on the motorways. However, that does not seem to be the case: motorways run through both Rødovre and Høje-Taastrup, which have the highest and the lowest increase respectively.
You can verify this yourself by switching the choropleth layer off and on in the bottom-right corner and seeing where the motorways (the red lines) run through the city. <- Overvejer at slette
import json

# --- Choropleth: % increase in bus speed per municipality after the lockdown --
# Municipalities labelled on the map.
sel_mun = ["Albertslund", "Ballerup", "Brøndby", "Dragør", "Frederiksberg", "Gentofte", "Gladsaxe",
           "Glostrup", "Herlev", "Hvidovre", "Høje-Taastrup", "Ishøj", "København",
           "Rødovre", "Tårnby", "Vallensbæk"]
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'
with open(os.path.join(path, "big_cph_lon_lat.json")) as f:
    big_cph = json.load(f)

speed = pd.read_csv(os.path.join(path, "speedPerMun.csv"), sep=";").set_index("Date")


def _mean_speed_table(daily_rows, value_column):
    """Return a frame with a "Kommune" column and the column-wise mean of `daily_rows`."""
    means = pd.DataFrame(daily_rows.mean(), columns=[value_column]).reset_index()
    return means.rename(columns={"index": "Kommune"})


# Split on the lockdown date. The "Date" index is compared against a string,
# so this presumably relies on ISO-formatted dates in the CSV (verify).
before_lockdown = _mean_speed_table(speed[speed.index < "2020-03-11"], "avg_speed_feb")
after_lockdown = _mean_speed_table(speed[speed.index >= "2020-03-11"], "avg_speed_mar")

avg_speed = before_lockdown.merge(after_lockdown, on="Kommune").set_index("Kommune")
avg_speed["per_increase"] = (
    (avg_speed["avg_speed_mar"] - avg_speed["avg_speed_feb"]) / avg_speed["avg_speed_feb"]
) * 100

mun_map = folium.Map(location=[55.676098, 12.568337], zoom_start=11, tiles='OpenStreetMap')

# Colour scale spanning the observed range of increases.
colormap = linear.YlOrRd_04.scale(avg_speed["per_increase"].min(),
                                  avg_speed["per_increase"].max())
df_dict = avg_speed["per_increase"]


def _speed_increase_style(feature):
    """Fill each municipality polygon according to its % increase (looked up by KOMNAVN)."""
    return {
        'fillColor': colormap(df_dict[feature['properties']['KOMNAVN']]),
        'color': 'black',
        'weight': 1,
        'fillOpacity': 1,
    }


folium.GeoJson(
    big_cph,
    name='Percentage increase in speed after lockdown',
    style_function=_speed_increase_style,
).add_to(mun_map)
folium.LayerControl(position="bottomright", collapsed=False).add_to(mun_map)
colormap.caption = 'Percentage increase in speed'
colormap.add_to(mun_map)

# Label positions, paired with `sel_mun` by position.
coordinates = [[55.693587, 12.344985], [55.734238, 12.364346],
               [55.653268, 12.418906], [55.592974, 12.645970],
               [55.682220, 12.514292], [55.753021, 12.552256],
               [55.747834, 12.472433], [55.671153, 12.398190],
               [55.735531, 12.434457], [55.634567, 12.465933],
               [55.665301, 12.266118], [55.623098, 12.330763],
               [55.711402, 12.533741], [55.682886, 12.448000],
               [55.610549, 12.600898], [55.636898, 12.368548]]
coordinates_dict = dict(zip(sel_mun, coordinates))
for m, label_position in coordinates_dict.items():
    name_icon = DivIcon(
        icon_size=(150, 36),
        icon_anchor=(25, 0),
        html=f'<div style="font-size: 10pt">{m}</div>',
    )
    folium.map.Marker(label_position, icon=name_icon).add_to(mun_map)
mun_map
#Plan B:
#<img src="https://tuelindhart.github.io/Transport_vs_covid19/Images/Choropleth_speed.png" style="width: 800px;"/>
Below you can see exactly how much the speed has increased in each municipality. Notice how Høje-Taastrup seems to be an outlier compared to the rest of the municipalities.
# --- Bar chart: % increase in bus speed per municipality ---------------------
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'
focus_municipality = ["Albertslund", "Ballerup", "Brøndby", "Dragør", "Frederiksberg", "Gentofte", "Gladsaxe",
                      "Glostrup", "Herlev", "Hvidovre", "Høje-Taastrup", "Ishøj", "København",
                      "Rødovre", "Tårnby", "Vallensbæk"]

df_hast_daily = pd.read_csv(os.path.join(path, "speedPerMun.csv"), sep=";").set_index("Date")

# Column-wise means before and from the lockdown date (string comparison on the "Date" index).
pre_rows = df_hast_daily[df_hast_daily.index < "2020-03-11"]
post_rows = df_hast_daily[df_hast_daily.index >= "2020-03-11"]
df_avg_hast_pre_lockdown = pre_rows.mean(0)
df_avg_hast_post_lockdown = post_rows.mean(0)

# Relative change in percent, one row per column of the speed table.
relative_change = (df_avg_hast_post_lockdown - df_avg_hast_pre_lockdown) / df_avg_hast_pre_lockdown
df_avg_hast = pd.DataFrame(relative_change, columns=["pct_increase"]) * 100
df_avg_hast = df_avg_hast.rename_axis("Municipality")

# Bokeh data source; the index is exposed as the "Municipality" column.
source = ColumnDataSource(df_avg_hast)

# Categorical x-axis: one factor per municipality.
x_range = list(map(str, source.data['Municipality']))

# A single colour is used for all bars.
color = bokeh.palettes.Category20[20][2]
bar = {}  # defined as in the original cell; not filled here

# Figure frame with title and axis labels; the toolbar is disabled.
p = figure(
    x_range=FactorRange(factors=x_range),
    title='Increase in speed per municipality',
    x_axis_label='Municipality',
    y_axis_label='Increase in speed (%)',
    plot_width=700,
    plot_height=400,
    toolbar_location=None,
)

r = p.vbar(
    "Municipality",
    top="pct_increase",
    source=source,
    width=0.8,
    fill_alpha=0.5,
    line_color=None,
    color=color,
    visible=True,
)
p.xaxis.major_label_orientation = math.pi / 4  # tilt labels 45 degrees
mun = r.name  # renderer name; not used further in this cell

# Hover shows the bar's percentage value and is bound to the bar renderer.
hover = HoverTool(
    tooltips=[('Increase in speed (%)', '@pct_increase{0.2f} %')],
    renderers=[r],
)
p.add_tools(hover)
p.y_range.start = 0  # bars start at zero
show(p)
Another way to measure the amount of public transport is to measure the average number of passengers per day boarding busses.
The bar chart below shows the avg. number of boarding passengers per municipality in the periods February 2019, March 2019, February 2020 and March 2020. If you click on the names February 2019 and March 2019, you can see that most passengers board in Frederiksberg and Copenhagen, while there are few in Vallensbæk and Dragør. You can also see that the avg. numbers of passengers in February and March are almost equal in 2019.
Now try to click on February 2020 and March 2020. Here the number of passengers dropped in all municipalities, and in some places it seems to have been cut in half! This indicates that the drop in passengers is due to the lockdown and not a seasonal effect.
But how about yearly effects? The opening of the new metro in Copenhagen probably has an effect on the number of passengers using busses. If you click on February 2019 and February 2020, you can see that the number of passengers stays mostly the same, except in Gladsaxe, Høje-Taastrup, Frederiksberg and Copenhagen (København), where the drop in the latter two is most likely due to the opening of the metro.
# --- Bar chart: average boarding passengers, Feb/Mar 2019 vs Feb/Mar 2020 ----
sel_mun = ["Albertslund", "Ballerup", "Brøndby", "Dragør", "Frederiksberg", "Gentofte", "Gladsaxe",
           "Glostrup", "Herlev", "Hvidovre", "Høje-Taastrup", "Ishøj", "København",
           "Rødovre", "Tårnby", "Vallensbæk"]
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data/' + 'Passagertal_SenesteVersion.csv'
df = pd.read_csv(path, sep=";", thousands=",")

# One frame per month: municipality plus boarding ("På") and alighting ("Af")
# counts, with incomplete rows dropped.
df_201902, df_201903, df_202002, df_202003 = (
    df[["Kommune", f"På_{month}", f"Af_{month}"]].dropna()
    for month in ("201902", "201903", "202002", "202003")
)

# Mean per municipality for each month, restricted to `sel_mun`, then joined
# side by side on the municipality name.
monthly_means = [
    month_frame.groupby(["Kommune"]).mean().loc[sel_mun]
    for month_frame in (df_201902, df_201903, df_202002, df_202003)
]
passenger_df = monthly_means[0]
for later_month in monthly_means[1:]:
    passenger_df = passenger_df.join(later_month, on="Kommune")

# Bokeh data source; the index is exposed as the "Kommune" column.
source = ColumnDataSource(passenger_df)

# Categorical x-axis: one factor per municipality.
x_range = list(map(str, source.data['Kommune']))

# Every second colour of the 8-colour palette: one colour per month.
colors = bokeh.palettes.Category20[8][0::2]
bar = {}  # month column -> vbar renderer

# Figure frame with title and axis labels; the toolbar is disabled.
p = figure(
    x_range=FactorRange(factors=x_range),
    title='Barchart of average number of onboarding passengers',
    x_axis_label='Municipalities',
    y_axis_label='Avg. number of boarding passengers',
    plot_width=950,
    plot_height=400,
    toolbar_location=None,
)

# Boarding columns for February and March of 2019 and 2020.
boardings = ["På_201902", "På_201903", "På_202002", "På_202003"]

# One initially hidden bar series per month; the legend reveals them on click.
for month_column, bar_color in zip(boardings, colors):
    bar[month_column] = p.vbar(
        "Kommune",
        top=month_column,
        source=source,
        width=0.8,
        fill_alpha=0.5,
        line_color=None,
        color=bar_color,
        visible=False,
    )
p.xaxis.major_label_orientation = math.pi / 4  # tilt labels 45 degrees

legend_names = {"På_201902": "February 2019", "På_201903": "March 2019",
                "På_202002": "February 2020", "På_202003": "March 2020"}
# Legend entries pair each readable month name with its bar renderer.
items = [(month_label, [bar[month_column]])
         for month_column, month_label in legend_names.items()]
legend = Legend(items=items, location=(0, 152))
p.add_layout(legend, 'right')  # legend sits to the right of the plot area

# One hover tool per series, so the tooltip reads that series' own column.
for month_column, renderer in bar.items():
    hover = HoverTool(
        tooltips=[('Avg. number of passengers: ', f'@{{{month_column}}}')],
        renderers=[renderer],
    )
    p.add_tools(hover)

p.y_range.start = 0  # bars start at zero
p.legend.click_policy = "hide"
p.legend.title = 'Click on names below to see barcharts'
show(p)
However, it can be hard to see on the bar chart above how much the number of passengers has dropped in each municipality. The bar chart below shows the number of boarding passengers in March as a percentage of February in the same year. March 2019 is not compared with March 2020 because of the difference in passengers between February 2019 and February 2020 for Frederiksberg, Copenhagen etc.
So if you click on Mar 2020 / Feb 2020 and hover over Vallensbæk, you see that the number of passengers using the busses in March 2020 was 68.86% of the number in February 2020. For good measure, you can compare it with the previous year, Mar 2019 / Feb 2019, and see that the numbers of passengers in the two months of 2019 are roughly the same.
Notice that the percentage values for Mar 2020 / Feb 2020 are not uniform. Maybe a choropleth map can shed some light on the differences.
# --- Bar chart: March passengers as a percentage of February, 2019 and 2020 --
feb_2019 = passenger_df["På_201902"]
mar_2019 = passenger_df["På_201903"]
feb_2020 = passenger_df["På_202002"]
mar_2020 = passenger_df["På_202003"]

# dif_*: relative change Feb -> Mar in %, per_20xx: March as % of February,
# per_feb / per_mar: relative drop from 2019 to 2020 in % for the same month.
derived_columns = {
    "dif_2019": (mar_2019 - feb_2019).div(feb_2019) * 100,
    "dif_2020": (mar_2020 - feb_2020).div(feb_2020) * 100,
    "per_2019": mar_2019.div(feb_2019) * 100,
    "per_2020": mar_2020.div(feb_2020) * 100,
    "per_feb": (feb_2019 - feb_2020).div(feb_2019) * 100,
    "per_mar": (mar_2019 - mar_2020).div(mar_2019) * 100,
}
dif_df = pd.DataFrame()
for column_name, values in derived_columns.items():
    dif_df[column_name] = values

# Bokeh data source; the chart below reads its "Kommune" column.
source = ColumnDataSource(dif_df)

# Categorical x-axis: one factor per municipality.
x_range = list(map(str, source.data['Kommune']))

# Every second colour of the 8-colour palette; the first two are used.
colors = bokeh.palettes.Category20[8][0::2]
bar = {}  # year column -> vbar renderer

# Figure frame with title and axis labels; the toolbar is disabled.
p = figure(
    x_range=FactorRange(factors=x_range),
    title='Passengers in march in relation to february in 2019 and 2020',
    x_axis_label='Municipalities',
    y_axis_label='% passengers in March in relation to February',
    plot_width=950,
    plot_height=400,
    toolbar_location=None,
)

# The two series shown: March as a share of February for each year.
years = ["per_2019", "per_2020"]

# One initially hidden bar series per year; the legend reveals them on click.
for year_column, bar_color in zip(years, colors):
    bar[year_column] = p.vbar(
        "Kommune",
        top=year_column,
        source=source,
        width=0.8,
        fill_alpha=0.5,
        line_color=None,
        color=bar_color,
        visible=False,
    )
p.xaxis.major_label_orientation = math.pi / 4  # tilt labels 45 degrees

legend_names = {"per_2019": "Mar 2019 / Feb 2019",
                "per_2020": "Mar 2020 / Feb 2020"}
# Legend entries pair each readable series name with its bar renderer.
items = [(series_label, [bar[year_column]])
         for year_column, series_label in legend_names.items()]
legend = Legend(items=items, location=(0, 198))
p.add_layout(legend, 'right')  # legend sits to the right of the plot area

# One hover tool per series, so the tooltip reads that series' own column.
for year_column, renderer in bar.items():
    hover = HoverTool(
        tooltips=[('% passengers in March in relation to February: ',
                   f'@{{{year_column}}}{{0.2f}}')],
        renderers=[renderer],
    )
    p.add_tools(hover)

p.legend.click_policy = "hide"
p.legend.title = 'Click on names below to see barcharts'
show(p)
The choropleth map shows the percentage decrease from February 2020 to March 2020. Surprisingly, the municipality with the biggest decrease is Albertslund, followed by Copenhagen (København) and Frederiksberg. The decreases in København and Frederiksberg can be explained by the fact that many busses run there and that it is often possible, and easier, to take the bike instead of a bus.
# --- Choropleth: % decrease in boarding passengers, Feb 2020 -> Mar 2020 -----
# Loads the pre-computed decrease table and the municipality outlines, then
# colours each municipality by its `pct_drop_2020` value and labels it by name.
# `sel_mun` (the list of municipality names) comes from an earlier cell.
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'
# All numeric columns are scaled by 100 (presumably fractions -> percent; confirm against the CSV).
df = pd.read_csv(os.path.join(path, "passenger_decrease.csv"), index_col="Kommune") * 100
with open(os.path.join(path, "big_cph_lon_lat.json")) as f:
    big_cph = json.load(f)

# Colour scale spanning the observed range of decreases.
colormap = linear.YlOrRd_04.scale(df["pct_drop_2020"].min(), df["pct_drop_2020"].max())
df_dict = df["pct_drop_2020"]

# The map is built once; previously it was created with zoom 11 and immediately
# replaced by this zoom-10.5 instance.
mun_map = folium.Map(location=[55.676098, 12.568337], zoom_start=10.5, tiles='OpenStreetMap')
folium.GeoJson(
    big_cph,
    name='Percentage decrease for boarding passengers',
    # Each polygon is filled according to the value looked up by its KOMNAVN property.
    style_function=lambda feature: {
        'fillColor': colormap(df_dict[feature['properties']['KOMNAVN']]),
        'color': 'black',
        'weight': 1,
        'fillOpacity': 1,
    },
).add_to(mun_map)
folium.LayerControl(position="bottomright", collapsed=False).add_to(mun_map)
# The caption names the period that `pct_drop_2020` is described as covering
# in the accompanying text (February 2020 -> March 2020).
colormap.caption = 'Percentage decrease from February 2020 to March 2020'
colormap.add_to(mun_map)

# Label positions, paired with `sel_mun` by position.
coordinates = [[55.693587, 12.344985], [55.734238, 12.364346],
               [55.653268, 12.418906], [55.592974, 12.645970],
               [55.682220, 12.514292], [55.753021, 12.552256],
               [55.747834, 12.472433], [55.671153, 12.398190],
               [55.735531, 12.434457], [55.634567, 12.465933],
               [55.665301, 12.266118], [55.623098, 12.330763],
               [55.711402, 12.533741], [55.682886, 12.448000],
               [55.610549, 12.600898], [55.636898, 12.368548]]
coordinates_dict = dict(zip(sel_mun, coordinates))
for m in sel_mun:
    folium.map.Marker(
        coordinates_dict[m],
        icon=DivIcon(
            icon_size=(150, 36),
            icon_anchor=(25, 0),
            html=f'<div style="font-size: 10pt">{m}</div>',
        ),
    ).add_to(mun_map)
mun_map
# --- Choropleth: % decrease in boarding passengers (rebuilt from loaded data) -
# Uses `df`, `big_cph` and `sel_mun` as defined by earlier cells.
pct_drop = df["pct_drop_2020"]
colormap = linear.YlOrRd_04.scale(pct_drop.min(), pct_drop.max())
df_dict = pct_drop
mun_map = folium.Map(location=[55.676098, 12.568337], zoom_start=10.5, tiles='OpenStreetMap')


def _passenger_drop_style(feature):
    """Fill each municipality polygon according to its value (looked up by KOMNAVN)."""
    municipality = feature['properties']['KOMNAVN']
    return {
        'fillColor': colormap(df_dict[municipality]),
        'color': 'black',
        'weight': 1,
        'fillOpacity': 1,
    }


outline_layer = folium.GeoJson(
    big_cph,
    name='Percentage decrease for boarding passengers',
    style_function=_passenger_drop_style,
)
outline_layer.add_to(mun_map)
folium.LayerControl(position="bottomright", collapsed=False).add_to(mun_map)
colormap.caption = 'Percentage decrease from February 2020 to March 2020'
colormap.add_to(mun_map)

# Label positions, paired with `sel_mun` by position.
coordinates = [[55.693587, 12.344985], [55.734238, 12.364346],
               [55.653268, 12.418906], [55.592974, 12.645970],
               [55.682220, 12.514292], [55.753021, 12.552256],
               [55.747834, 12.472433], [55.671153, 12.398190],
               [55.735531, 12.434457], [55.634567, 12.465933],
               [55.665301, 12.266118], [55.623098, 12.330763],
               [55.711402, 12.533741], [55.682886, 12.448000],
               [55.610549, 12.600898], [55.636898, 12.368548]]
coordinates_dict = dict(zip(sel_mun, coordinates))
for m, label_position in coordinates_dict.items():
    name_icon = DivIcon(
        icon_size=(150, 36),
        icon_anchor=(25, 0),
        html=f'<div style="font-size: 10pt">{m}</div>',
    )
    folium.map.Marker(label_position, icon=name_icon).add_to(mun_map)
mun_map
The data shows that the increase in speed and the decrease in passengers differ between the municipalities. The question is whether the demographics of the municipalities can explain these differences. One way to explore this is to draw a scatter plot with the explanatory variable on the x-axis and the variable you want to explain on the y-axis. In this way it is possible to spot trends and relationships between two variables.
Below there is 4 scatter plots with the percentage increase in speed plotted on the y-axis. The 4 demographic variables are:
First of all, when looking for a correlation it is preferable to have more than 16 data points, because with only a few data points there is a risk that an apparent trend is random. Therefore, remember that these explorations of the variables can only give an indication of possible trends.
Try to look at the 4 graphs and see if you think some of the variables can explain the different increases in speed in each municipality. You can hover over the graphs to see which municipality a point represents and what its exact values are on the x- and y-axis. You can also mark a data point, and it will be highlighted on the other graphs. Can you find the position of Høje-Taastrup on all the graphs? Do you think it is an outlier?
Also, have you noticed that you can make the x-axis for Density logarithmic? Can you see what it does to the graph if you click on it? Exactly, the points become more linear. That means the relationship between Density and % increase in speed appears to be logarithmic! Can you see if there appear to be linear relationships on the other graphs?
# --- Scatter grid: % increase in speed vs. demographic variables -------------
# For each explanatory variable this cell draws a scatter plot against
# `speed_change`, plus a hidden least-squares line and a hidden R^2 label that
# the reader can reveal with toggle buttons. Density is shown twice (raw and
# log-transformed) behind a pair of tabs.
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'
df = pd.read_csv(os.path.join(path, "allVariables.csv"), index_col="Municipality")
df["log_Density"] = np.log(df["Density"])
# Keep the five explanatory variables plus the target. The explicit copy makes
# the column assignments below act on an independent frame, not on a slice.
df = df[["Density", "DagNatRatio", "pct_highly_educated", "pct_not_vestern",
         "log_Density", "speed_change"]].copy()
# Scale by 100 and round to two decimals (presumably fractions -> percent; confirm against the CSV).
for pct_column in ("DagNatRatio", "speed_change", "pct_highly_educated", "pct_not_vestern"):
    df[pct_column] = round(df[pct_column] * 100, 2)

# Linear regression lines ------------------------------------------------------
linear_regressor = LinearRegression()
df_temp = pd.DataFrame()
# Every second colour of the 20-colour palette: one colour per scatter plot.
color = bokeh.palettes.Category20[20][::2]
# The target is the same for every fit, so it is prepared once.
Y = df.speed_change.to_numpy().reshape(-1, 1)
for x in df.columns:
    # One single-feature fit per column; predictions are stored as "prediction_<column>".
    X = df[x].to_numpy().reshape(-1, 1)
    linear_regressor.fit(X, Y)
    df_temp["prediction_" + x] = list(linear_regressor.predict(X).squeeze())
df_temp = df_temp.set_index(df.index)
df = df.join(df_temp, on="Municipality")

# Figures ----------------------------------------------------------------------
# All panels use a linear axis: the "logarithmic" Density panel plots the
# already log-transformed `log_Density` column. Defined here because the figure
# call below needs it and nothing earlier in the notebook assigns it.
x_axis_type = "linear"

source = ColumnDataSource(data=df)
f = [0] * 5        # one figure per explanatory variable
toggle = [0] * 10  # slots 0-4: regression-line toggles, slots 5-9: R^2 toggles
var_dict = {"Density": " Density (persons/km^2)",
            "DagNatRatio": "Day/Night %",
            "pct_highly_educated": "% with high education",
            "pct_not_vestern": "% with non-vestern background",
            "log_Density": "Log Density (persons/km^2)"}
for i, var in enumerate(df.columns[:5]):
    name = var_dict[var]
    TOOLTIPS = [("Municipality: ", "@Municipality"),
                (name, f"@{var}"),
                ("Increase in speed: ", "@speed_change{0.2f}")]
    f[i] = figure(plot_width=400, plot_height=350,
                  title=f'Speed increase vs. {name}',
                  x_axis_label=f'{name}',
                  y_axis_label='% increase in speed',
                  background_fill_color="white", background_fill_alpha=0.8,
                  border_fill_color="white", border_fill_alpha=0.8,
                  tooltips=TOOLTIPS,
                  x_axis_type=x_axis_type,
                  toolbar_location=None,
                  tools="box_select",
                  )

    # R^2 of the single-variable fit = squared Pearson correlation.
    cor = df[["speed_change", var]].corr()
    r_squared = round(cor.iloc[0, 1] ** 2, 3)
    r_squared_label = Label(x=200, y=5, x_units='screen', y_units='screen',
                            text=f' R^2 value: {r_squared} ', render_mode='canvas',
                            border_line_color='black', border_line_alpha=1.0,
                            background_fill_color='white', background_fill_alpha=1.0,
                            text_font_size='12pt', visible=False)

    # Scatter points, plus the (initially hidden) regression line and R^2 label.
    f[i].circle(x=var, y='speed_change', size=10, source=source, color=color[i])
    predict_name = "prediction_" + var
    line = f[i].line(x=var, y=predict_name, source=source,
                     color="grey", line_width=2, visible=False)
    f[i].add_layout(r_squared_label)

    # Each toggle's `active` state drives the `visible` property of its target.
    toggle1 = Toggle(label="Show regression line", button_type="default",
                     active=False, max_width=400)
    toggle1.js_link('active', line, 'visible')
    toggle2 = Toggle(label="Show R^2 value", button_type="default",
                     active=False, width=400)
    toggle2.js_link('active', r_squared_label, 'visible')
    toggle[i] = toggle1
    toggle[i + 5] = toggle2

# Layout -----------------------------------------------------------------------
# Pair each figure's two toggles in a column.
toggles1 = column(toggle[0], toggle[5])
toggles2 = column(toggle[1], toggle[6])
toggles3 = column(toggle[2], toggle[7])
toggles4 = column(toggle[3], toggle[8])
toggles5 = column(toggle[4], toggle[9])

# Density and log(Density) share one grid slot behind tabs, as do their toggles.
density_plot_tabs = Tabs(tabs=[Panel(child=f[0], title="Linear x-axis"),
                               Panel(child=f[4], title="Logarithmic x-axis")])
density_toggle_tabs = Tabs(tabs=[Panel(child=toggles1, title="Linear"),
                                 Panel(child=toggles5, title="Log")])

l = layout([[density_plot_tabs, f[1]],
            [density_toggle_tabs, toggles2],
            [f[2], f[3]],
            [toggles3, toggles4]],
           )
show(l)
Sometimes it is helpful to use linear regression to see how well a straight line fits the data. In other words, we want to see how well we can predict the percentage change in speed from our 4 variables. Try clicking the 4 Show regression line buttons and see how the lines fit the data. As suspected, there is a positive linear relationship with log(Density) and a negative linear relationship with % with non-vestern background, while there is only a small correlation with % with high education and with Day/Night %.
However, just because the slope of the line is negative or positive, it does not necessarily mean that the line predicts the data well. A helpful measure is $R^2$, a value between 0 and 1 which basically measures how well the line explains the variable y (% change in speed in our case), where 0 denotes that it does not explain the data at all and 1 denotes that it fits the data perfectly. Try clicking Show R^2 value and see how well the regression lines fit the data.
The highest $R^2$ value is for $log(Density)$ at 0.338, which means that 33.8% of the change in speed can be explained by the population density of the municipality. Notice that the $R^2$ value drops to 0.237 if we don't take the logarithm.
Taking into consideration that there are probably many factors that weigh in on the % increase in speed, an $R^2$ of 0.338 is pretty good! It also makes intuitive sense, because densely populated cities usually have more traffic jams. That the relationship is logarithmic means that, beyond some point, a higher population density no longer has the same effect on the % increase in speed.
The lowest $R^2$ value is 0.0 for Day/Night %, which is somewhat surprising. Maybe it is because the use of vehicles is rarely restricted to a single municipality, so when people work outside their own municipality their cars also affect the traffic in other municipalities.
Now we will use the same variables to explore whether the % decrease in passengers depends on them. Looking at the scatter plots, does it look as if the Day/Night Ratio, which did not explain the increase in speed at all, explains the % change in passengers best? Try clicking Show regression line and Show R^2 value to see if that is the case.
The Day/Night % has the highest $R^2$ value and explains 37.8% of the decrease in passengers. This indicates that the % decrease in passengers depends on how many people study or work outside the municipality. Judging by the $R^2$ value, Density may also have an effect on the % decrease in passengers. However, $R^2$ is not a perfect measure, and most values are clustered vertically to the left with a few exceptions to the right. So it is possible that there is no correlation and that two of the points randomly make it appear that there is a small one.
# --- Scatter grid: % decrease in passengers vs. demographic variables --------
# For each explanatory variable this cell draws a scatter plot against
# `passenger_change`, plus a hidden least-squares line and a hidden R^2 label
# that the reader can reveal with toggle buttons. Density is shown twice (raw
# and log-transformed) behind a pair of tabs.
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'
df = pd.read_csv(os.path.join(path, "allVariables.csv"), index_col="Municipality")
df["log_Density"] = np.log(df["Density"])
# Keep the five explanatory variables plus the target. The explicit copy makes
# the column assignments below act on an independent frame, not on a slice.
df = df[["Density", "DagNatRatio", "pct_highly_educated", "pct_not_vestern",
         "log_Density", "passenger_change"]].copy()
# Scale by 100 and round to two decimals (presumably fractions -> percent; confirm against the CSV).
for pct_column in ("DagNatRatio", "passenger_change", "pct_highly_educated", "pct_not_vestern"):
    df[pct_column] = round(df[pct_column] * 100, 2)

# Linear regression lines ------------------------------------------------------
linear_regressor = LinearRegression()
df_temp = pd.DataFrame()
# Every second colour of the 20-colour palette: one colour per scatter plot.
color = bokeh.palettes.Category20[20][::2]
# The target is the same for every fit, so it is prepared once.
Y = df.passenger_change.to_numpy().reshape(-1, 1)
for x in df.columns:
    # One single-feature fit per column; predictions are stored as "prediction_<column>".
    X = df[x].to_numpy().reshape(-1, 1)
    linear_regressor.fit(X, Y)
    df_temp["prediction_" + x] = list(linear_regressor.predict(X).squeeze())
df_temp = df_temp.set_index(df.index)
df = df.join(df_temp, on="Municipality")

# Figures ----------------------------------------------------------------------
# All panels use a linear axis: the "logarithmic" Density panel plots the
# already log-transformed `log_Density` column. Defined here because the figure
# call below needs it and nothing earlier in the notebook assigns it.
x_axis_type = "linear"

source = ColumnDataSource(data=df)
f = [0] * 5        # one figure per explanatory variable
toggle = [0] * 10  # slots 0-4: regression-line toggles, slots 5-9: R^2 toggles
var_dict = {"Density": " Density (persons/km^2)",
            "DagNatRatio": "Day/Night Ratio",
            "pct_highly_educated": "% with high education",
            "pct_not_vestern": "% with non-vestern background",
            "log_Density": "Log Density (persons/km^2)"}
for i, var in enumerate(df.columns[:5]):
    name = var_dict[var]
    TOOLTIPS = [("Municipality: ", "@Municipality"),
                (name, f"@{var}"),
                ("Decrease in passengers: ", "@passenger_change{0.2f}")]
    f[i] = figure(plot_width=400, plot_height=350,
                  title=f'Passenger decrease vs. {name}',
                  x_axis_label=f'{name}',
                  y_axis_label='% decrease in passengers',
                  background_fill_color="white", background_fill_alpha=0.8,
                  border_fill_color="white", border_fill_alpha=0.8,
                  tooltips=TOOLTIPS,
                  x_axis_type=x_axis_type,
                  toolbar_location=None,
                  tools="box_select",
                  )

    # R^2 of the single-variable fit = squared Pearson correlation.
    cor = df[["passenger_change", var]].corr()
    r_squared = round(cor.iloc[0, 1] ** 2, 3)
    r_squared_label = Label(x=200, y=5, x_units='screen', y_units='screen',
                            text=f' R^2 value: {r_squared} ', render_mode='canvas',
                            border_line_color='black', border_line_alpha=1.0,
                            background_fill_color='white', background_fill_alpha=1.0,
                            text_font_size='12pt', visible=False)

    # Scatter points, plus the (initially hidden) regression line and R^2 label.
    f[i].circle(x=var, y='passenger_change', size=10, source=source, color=color[i])
    predict_name = "prediction_" + var
    line = f[i].line(x=var, y=predict_name, source=source,
                     color="grey", line_width=2, visible=False)
    f[i].add_layout(r_squared_label)

    # Each toggle's `active` state drives the `visible` property of its target.
    toggle1 = Toggle(label="Show regression line", button_type="default",
                     active=False, max_width=400)
    toggle1.js_link('active', line, 'visible')
    toggle2 = Toggle(label="Show R^2 value", button_type="default",
                     active=False, width=400)
    toggle2.js_link('active', r_squared_label, 'visible')
    toggle[i] = toggle1
    toggle[i + 5] = toggle2

# Layout -----------------------------------------------------------------------
# Pair each figure's two toggles in a column.
toggles1 = column(toggle[0], toggle[5])
toggles2 = column(toggle[1], toggle[6])
toggles3 = column(toggle[2], toggle[7])
toggles4 = column(toggle[3], toggle[8])
toggles5 = column(toggle[4], toggle[9])

# Density and log(Density) share one grid slot behind tabs, as do their toggles.
density_plot_tabs = Tabs(tabs=[Panel(child=f[0], title="Linear x-axis"),
                               Panel(child=f[4], title="Logarithmic x-axis")])
density_toggle_tabs = Tabs(tabs=[Panel(child=toggles1, title="Linear"),
                                 Panel(child=toggles5, title="Log")])

l = layout([[density_plot_tabs, f[1]],
            [density_toggle_tabs, toggles2],
            [f[2], f[3]],
            [toggles3, toggles4]],
           )
show(l)
Do you remember that Albertslund stands out on the choropleth map? Try to find it on the scatter plot and mark it by clicking and dragging to see where it is on the other scatter plots. Do you think the variables explain Albertslund or does it appear to be an outlier?
It appears that none of the variables explain well why Albertslund stands out. This may be due to special conditions that only apply to Albertslund. However, because we have so few observations, Albertslund seemingly affects the fit of the regression lines substantially.
For fun, let's say we want to predict the decrease in passengers for all municipalities except Albertslund and see how well the lines then fit the data, which can be seen on the graphs below.
# Same scatter/regression figure as the previous cell, but fitted and drawn
# without Albertslund.
# NOTE(review): `x_axis_type` is not assigned in this cell - presumably it is
# set earlier in the notebook; confirm.
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data'
df = pd.read_csv(os.path.join(path, "allVariables.csv"), index_col="Municipality")
df["log_Density"] = np.log(df["Density"])
df = df[["Density", "DagNatRatio", "pct_highly_educated", "pct_not_vestern",
         "log_Density", "passenger_change"]]
# Convert the fraction columns to percentages with two decimals.
for pct_col in ["DagNatRatio", "passenger_change", "pct_highly_educated", "pct_not_vestern"]:
    df[pct_col] = round(df[pct_col] * 100, 2)
# Drop Albertslund before fitting.
df = df[df.index != "Albertslund"]

# Calculating linear regression lines ########################################
linear_regressor = LinearRegression()
# Every second colour of the 20-colour palette; one colour per scatter plot.
color = bokeh.palettes.Category20[20][::2]
# The regression target is the same for every column, so build it once.
Y = df.passenger_change.to_numpy().reshape(-1, 1)
predictions = {}
for x in df.columns:
    X = df[x].to_numpy().reshape(-1, 1)
    linear_regressor.fit(X, Y)
    predictions["prediction_" + x] = list(linear_regressor.predict(X).squeeze())
# One "prediction_<column>" column per input column, indexed like df.
df_temp = pd.DataFrame(predictions, index=df.index)
df = df.join(df_temp, on="Municipality")

########### Create figure ####################################################
source = ColumnDataSource(data=df)
var_dict = {"Density": " Density (persons/km^2)",
            "DagNatRatio": "Day/Night Ratio",
            "pct_highly_educated": "% with high education",
            "pct_not_vestern": "% with non-vestern background",
            "log_Density": "Log Density (persons/km^2)"}
f = []
line_toggles = []
r2_toggles = []
# The first five columns are the explanatory variables (see selection above).
for i, var in enumerate(df.columns[:5]):
    name = var_dict[var]
    TOOLTIPS = [("Municipality: ", "@Municipality"),
                (name, f"@{var}"),
                ("Decrease in passengers: ", "@passenger_change{0.2f}")]
    fig = figure(plot_width=400, plot_height=350,
                 title=f'Passenger decrease vs. {name}',
                 x_axis_label=f'{name}',
                 y_axis_label='% decrease in passengers',
                 background_fill_color="white", background_fill_alpha=0.8,
                 border_fill_color="white", border_fill_alpha=0.8,
                 tooltips=TOOLTIPS,
                 x_axis_type=x_axis_type,
                 toolbar_location=None,
                 tools="box_select",
                 )
    # R^2 of the simple regression is the squared Pearson correlation.
    cor = df[["passenger_change", var]].corr()
    R_sqrt = round(cor.iloc[0, 1] ** 2, 3)
    R_sqrt_Label = Label(x=200, y=5, x_units='screen', y_units='screen',
                         text=f' R^2 value: {R_sqrt} ', render_mode='canvas',
                         border_line_color='black', border_line_alpha=1.0,
                         background_fill_color='white', background_fill_alpha=1.0,
                         text_font_size='12pt', visible=False,
                         )
    # Observations, then the (initially hidden) regression line and label.
    fig.circle(x=var, y='passenger_change', size=10, source=source, color=color[i])
    predict_name = "prediction_" + var
    line = fig.line(x=var, y=predict_name, source=source, color="grey",
                    line_width=2, visible=False)
    fig.add_layout(R_sqrt_Label)
    # Each toggle drives the visibility of its glyph/annotation client-side.
    toggle1 = Toggle(label="Show regression line", button_type="default",
                     active=False, max_width=400)
    toggle1.js_link('active', line, 'visible')
    toggle2 = Toggle(label="Show R^2 value", button_type="default",
                     active=False, width=400)
    toggle2.js_link('active', R_sqrt_Label, 'visible')
    f.append(fig)
    line_toggles.append(toggle1)
    r2_toggles.append(toggle2)

# Same layout as before: entries 0-4 are line toggles, 5-9 are R^2 toggles.
toggle = line_toggles + r2_toggles
toggles1, toggles2, toggles3, toggles4, toggles5 = [
    column(line_btn, r2_btn) for line_btn, r2_btn in zip(toggle[:5], toggle[5:])
]
# Density and log-density share one slot, switched through tabs.
tab1 = Tabs(tabs=[Panel(child=f[0], title="Linear x-axis"),
                  Panel(child=f[4], title="Logarithmic x-axis")])
tab2 = Tabs(tabs=[Panel(child=toggles1, title="Linear"),
                  Panel(child=toggles5, title="Log")])
l = layout([[tab1, f[1]],
            [tab2, toggles2],
            [f[2], f[3]],
            [toggles3, toggles4]],
           )
show(l)
Now all regression lines explain at least 21% of the variation in the decrease of passengers, and % with high education goes from explaining 16.6% to 42.0%! One possible interpretation is that people with a high education can more easily work or study from home and that there are specific things going on in Albertslund that make it deviate from the rest. However, what it shows most of all is how sensitive linear regression is when you have few data points, and we therefore stress that the correlations we find can at best be interpreted as an indication.
So far, we have gotten a better understanding of how a single variable alone affects the % speed and passenger change in the municipalities. However, we do not yet understand how the variables put together explain the % change. We also add some more variables, namely % unemployed, disposable income and median age, which gives 7 variables in total. We cannot plot in more than 3D, so we have to be smart about it. Luckily, Karl Pearson came up with a smart technique back in 1901 called Principal Component Analysis.
We won't go into details of how it works, but basically each Principal Component (PC) shows how the variables explain variation in the data. So, if you click on PC1 for % speed increase on the graph below, we can tell you that a municipality with the combination of above-average population density, above-average unemployment, above-average education level, and below-average % of non-Western immigrants and median age is likely to have a high increase in speed. How could we tell that, you might wonder?
Imagine that you have a checkbox in front of you where you can give a checkmark for each variable and the more you can check off, the more likely it is that there is a high increase in speed or high decrease in passengers.
You need to know 3 things to be able to 'check the boxes off':
Now we have a challenge for you. Click on PC3 for passenger decrease and answer this question. If we have a municipality with below average population density, above average of people present both day and night, below average unemployment and education level and below average median age, is the municipality then likely to have a large or small decrease in passengers? To help you keep track we have made a checkbox.
## Prepare data ##############################################################
# For each change variable, run a 4-component PCA on the standardised
# explanatory variables plus that change variable, and draw the component
# coefficients as bar charts that can be toggled through the legend.
variables = ["log_Density", "DagNatRatio", "pct_unemployed", "pct_highly_educated",
             "pct_not_vestern", "income", "median_age",
             ]
change_variables = ["speed_change", "passenger_change"]
sel_mun = [
    "Albertslund",
    "Ballerup", "Brøndby", "Dragør", "Frederiksberg", "Gentofte", "Gladsaxe",
    "Glostrup", "Herlev", "Hvidovre", "Høje-Taastrup", "Ishøj", "København",
    "Rødovre", "Tårnby", "Vallensbæk"]
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data/'
df = pd.read_csv(os.path.join(path, "allVariables.csv"), sep=",", index_col=0)
df["log_Density"] = np.log(df["Density"])

p = {}  # one figure per change variable
change_dict = {"speed_change": "Speed increase (%)",
               "passenger_change": "Passenger decrease (%)"}
# These do not depend on the change variable, so define them once.
PCs = ["PC1", "PC2", "PC3", "PC4"]
PC_list = ["PC" + str(k) for k in range(1, 5)]
colors = bokeh.palettes.Category20[18][0::2]

for c in change_variables:
    columns = variables + [c]
    # Standardise so every variable has zero mean and unit variance.
    x = StandardScaler().fit_transform(df[columns].values)
    df_std = pd.DataFrame(data=x, columns=columns)
    pca = PCA(n_components=4)
    PC = pca.fit_transform(df_std)
    components = pca.components_
    # Transpose so rows are variables and columns are PC1..PC4.
    PC_4D = pd.DataFrame(components, columns=columns, index=PC_list).T

    ## Create plot ###########################################################
    source = ColumnDataSource(PC_4D)
    # Categorical x-axis: one factor per variable name.
    x_range = list(map(str, source.data['index']))
    p[c] = figure(x_range=FactorRange(factors=x_range),
                  title=f'{change_dict[c]} Principal Component Coefficients',
                  x_axis_label='Variables',
                  y_axis_label='Coefficients',
                  plot_width=500, plot_height=400,
                  toolbar_location=None)
    # One hidden bar series per principal component; the legend reveals them.
    bar = {
        pc: p[c].vbar("index", top=pc, source=source, width=0.5,
                      fill_alpha=0.5, line_color=None,
                      color=colors[indx], visible=False)
        for indx, pc in enumerate(PCs)
    }
    p[c].xaxis.major_label_orientation = math.pi / 4
    # Legend entries pair each component name with its bar renderer.
    items = [(pc, [renderer]) for pc, renderer in bar.items()]
    legend = Legend(items=items, location=(0, 125),)
    p[c].add_layout(legend, "right")  # legend sits outside the plot area
    p[c].legend.click_policy = "hide"
    p[c].legend.title = 'Show PC'

show(row(p["speed_change"], p["passenger_change"]))
# Unticked checklist of the variables for the reader's exercise.
checkbox_group = CheckboxGroup(
    labels=variables)
show(checkbox_group)
Click here to see the answer.
Actually, the Principal Components (PCs) are axes in a coordinate system, and the coefficients shown above control how low (-) or how high (+) the municipalities land on the PCs. The PCs ensure that there is as much variation as possible, which makes it possible to see what sets the municipalities apart in terms of increase in speed and decrease in passengers. The plots are color coded so that the darker the green, the higher the change.
Try to hover over the darkest circle on the plot to the left and you will see that it is Rødovre, with the highest increase in speed, and that it scores positively on PC1 and negatively on PC3. Then try to hover over the lightest circle and you will see that it is Høje-Taastrup, with the lowest increase in speed, which is negative on PC1 and positive on PC3.
Now notice how the colors generally are light on one side of the dashed line and generally dark on the other side of the dashed line. This means that we simply by looking at where they land on the PC's can determine if the increase in speed or decrease in passengers is likely to become high or low!
############ prepare data ##############################################################################################
from branca.colormap import linear
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from bokeh.layouts import layout, gridplot, row, column, widgetbox
%matplotlib inline
# Scatter plots of the municipalities in the PC1/PC3 plane, one per change
# variable, coloured by the size of the change. The hover tooltip reports
# whether each explanatory variable is above or below the mean.
variables = ["log_Density", "DagNatRatio", "pct_unemployed", "pct_highly_educated",
             "pct_not_vestern", "income", "median_age",
             ]
change_variables = ["speed_change", "passenger_change"]
change_dict = {"speed_change": "Speed increase (%)",
               "passenger_change": "Passenger decrease (%)"}
sel_mun = [
    "Albertslund",
    "Ballerup", "Brøndby", "Dragør", "Frederiksberg", "Gentofte", "Gladsaxe",
    "Glostrup", "Herlev", "Hvidovre", "Høje-Taastrup", "Ishøj", "København",
    "Rødovre", "Tårnby", "Vallensbæk"]
path = '/Users/tuethomsen28/Google Drev/SocialDataVizz/Data/'
df = pd.read_csv(os.path.join(path, "allVariables.csv"), sep=",", index_col=0)
df["log_Density"] = np.log(df["Density"])

# Select the municipalities by label. Previously the standardised values were
# taken positionally from every row of the CSV and then relabelled with
# sel_mun, which is only correct if the file's row order matches sel_mun.
df_sel = df.loc[sel_mun]
PCs = ["PC1", "PC2", "PC3", "PC4"]
# Start from a fresh dict so this cell can run on its own; both keys are set
# in the loop below.
p = {}

for c in change_variables:
    columns = variables + [c]
    # Standardise: afterwards a positive value means "above the mean".
    x = StandardScaler().fit_transform(df_sel[columns].to_numpy())
    df_std = pd.DataFrame(data=x, columns=columns, index=sel_mun)
    pca = PCA(n_components=4)
    PC_color = pca.fit_transform(df_std)
    df_color = pd.DataFrame(PC_color, index=sel_mun, columns=PCs)

    # Colour scale spans the observed range of the change variable.
    colormap = linear.YlGn_09.scale(df_sel[c].min(), df_sel[c].max())
    df_color[c] = df_sel[c] * 100
    df_color["color"] = [colormap(value) for value in df_sel[c]]

    # "Above"/"Below" flags for the explanatory variables only (the change
    # variable itself is left out), used in the hover tooltip.
    df_above_below = pd.DataFrame(
        np.where(df_std[variables] > 0, "Above", "Below"),
        index=df_std.index, columns=variables)
    # Keep the index named "index" so the "@index" tooltip field resolves.
    df_color = df_color.join(df_above_below).rename_axis("index")

    ########### Create figure ################################################
    source = ColumnDataSource(data=df_color)
    TOOLTIPS = [("Municipality: ", "@index"),
                ("% change", "@{%s}" % c),
                ("log Density", "@log_Density"),
                ("Day/Night", "@DagNatRatio"),
                ("Unemployment", "@pct_unemployed"),
                ("Education", "@pct_highly_educated"),
                ("Non-vestern", "@pct_not_vestern"),
                ("Income", "@income"),
                ("Median age", "@median_age")]
    p[c] = figure(plot_height=500, plot_width=500,
                  title=f'{change_dict[c]}: PC1 and PC3',
                  x_axis_label='Principal Component 1',
                  y_axis_label='Principal Component 3',
                  background_fill_color="white", background_fill_alpha=0.8,
                  border_fill_color="white", border_fill_alpha=0.8,
                  tooltips=TOOLTIPS,
                  toolbar_location=None,
                  y_range=(-2.5, 2.7)
                  )
    p[c].circle(x="PC1", y='PC3', size=15, source=source, color="color",
                line_alpha=1, line_color="black")
    # Hand-placed dashed guide line between the light and dark circles.
    p[c].line([-2.5, 4.5], [-2, 2.1], line_color="grey", line_dash="dashed")

show(row(p["speed_change"], p["passenger_change"]))
So do you remember that Albertslund seemed to be an outlier when doing linear regression? Let us see if we can now explain why it has a high change in passengers. If you hover over the darkest circle on the right plot you will find Albertslund. Albertslund lies almost at 0 on PC1 while it is the lowest point on PC3. So PC3 alone can explain why Albertslund has a high decrease in passengers. Let's look at its values.
Wait a little. They are in fact identical to the municipality described above so you have actually already explained why Albertslund has a high decrease in passengers.